/**
 * Returns a pdf.js viewport for `page` at the given `scale`.
 *
 * The object-style call `getViewport({ scale })` is tried first. If it throws,
 * or yields a viewport whose dimensions are not finite numbers, the positional
 * form `getViewport(scale)` is used instead.
 * NOTE(review): presumably some older pdf.js builds accept only the positional
 * form and may return NaN dimensions rather than throwing — confirm against the
 * bundled /js/pdf.js.
 *
 * @param {object} page - pdf.js page object exposing `getViewport`.
 * @param {number} scale - Render scale factor.
 * @returns {object} The viewport returned by pdf.js.
 */
function _ctPdfViewport(page, scale) {
    let viewport;
    try {
        viewport = page.getViewport({ scale: scale });
    } catch (e) {
        return page.getViewport(scale);
    }
    if (!viewport || !Number.isFinite(viewport.width) || !Number.isFinite(viewport.height)) {
        return page.getViewport(scale);
    }
    return viewport;
}

/**
 * Ensures the `PDFJS` global is available, loading `/js/pdf.js` when needed.
 *
 * @returns {Promise} Resolves once pdf.js is usable; rejects if the script fails to load.
 */
function _ctLoadPdfJs() {
    if (window.PDFJS && PDFJS.getDocument) {
        return Promise.resolve();
    }
    return loadScriptPromise('/js/pdf.js');
}

/**
 * Draws each recognized word as fully transparent text on `outPage`, so the
 * page image stays visible while the text becomes selectable/searchable.
 *
 * @param {object} outPage - pdf-lib page to draw on.
 * @param {Array} words - Word objects from the OCR result (`text`, `bbox`).
 * @param {object} font - Embedded pdf-lib font used for the text layer.
 * @param {number} pageHeight - Page height, used to flip the top-left-origin
 *     bounding boxes into bottom-left-origin page coordinates.
 */
function _ctDrawOcrWords(outPage, words, font, pageHeight) {
    for (const word of words) {
        const txt = (word.text || '').trim();
        if (!txt) continue;
        const box = word.bbox || {};
        const x = box.x0 || 0;
        const yTop = box.y0 || 0;
        const yBottom = box.y1 || yTop;
        // Font size follows the box height, clamped to a sane range.
        const size = Math.max(6, Math.min(72, (yBottom - yTop) || 10));
        try {
            outPage.drawText(txt, { x: x, y: pageHeight - yBottom, size: size, font: font, opacity: 0 });
        } catch (e) {
            // The standard Helvetica font cannot encode every character; skip
            // such a word instead of failing the whole document.
        }
    }
}

/**
 * Renders every page of the PDF in `blob` to an image, OCRs it and builds a
 * new PDF containing the page images plus an invisible text layer.
 *
 * Each output page is sized to the rendered viewport, i.e. the source page
 * size multiplied by the render scale.
 *
 * @param {Blob} blob - Source PDF.
 * @param {string} language - Tesseract language code(s), e.g. "eng".
 * @returns {Promise<Uint8Array>} Bytes of the generated PDF.
 */
async function _ctBuildSearchablePdf(blob, language) {
    const inputBytes = new Uint8Array(await blob.arrayBuffer());
    PDFJS.workerSrc = '/js/pdf.worker.js';
    let loadingTask;
    try {
        loadingTask = PDFJS.getDocument(inputBytes);
    } catch (e) {
        loadingTask = PDFJS.getDocument({ data: inputBytes });
    }
    // getDocument may return either a task carrying `.promise` or a thenable.
    const srcPdf = (loadingTask && loadingTask.promise) ? await loadingTask.promise : await loadingTask;
    const outPdf = await PDFLib.PDFDocument.create();
    const font = await outPdf.embedFont(PDFLib.StandardFonts.Helvetica);
    const renderScale = 2.0;

    // One OCR worker is shared by all pages so language data is loaded once.
    const worker = await Tesseract.createWorker(language);
    try {
        for (let p = 1; p <= srcPdf.numPages; p++) {
            const srcPage = await srcPdf.getPage(p);
            const viewport = _ctPdfViewport(srcPage, renderScale);
            const canvas = document.createElement('canvas');
            canvas.width = Math.ceil(viewport.width);
            canvas.height = Math.ceil(viewport.height);
            await srcPage.render({ canvasContext: canvas.getContext('2d'), viewport: viewport }).promise;

            const pngBlob = await new Promise(function (resolve) { canvas.toBlob(resolve, 'image/png'); });
            if (!pngBlob) throw new Error('Could not render PDF page image for OCR.');
            const pngBytes = new Uint8Array(await pngBlob.arrayBuffer());

            const embeddedImage = await outPdf.embedPng(pngBytes);
            const outPage = outPdf.addPage([viewport.width, viewport.height]);
            outPage.drawImage(embeddedImage, { x: 0, y: 0, width: viewport.width, height: viewport.height });

            // Reuse the already-encoded PNG rather than encoding the canvas again.
            const result = await worker.recognize(pngBlob);
            const words = (result && result.data && result.data.words) ? result.data.words : [];
            _ctDrawOcrWords(outPage, words, font, viewport.height);
        }
    } finally {
        await worker.terminate();
    }
    return outPdf.save({ useObjectStreams: true });
}

/**
 * OCRs a PDF in the browser and hands the searchable result to `add_file_output`.
 *
 * The OCR language is read from the first `.extra-input input` element and
 * defaults to "eng". Failures are reported to the user via `alert` and logged
 * to the console.
 *
 * @param {Blob} blob - The source PDF.
 * @param {string} fileName - Original file name; the output is named `<base>-ocr.pdf`.
 */
function processFile(blob, fileName) {
    let language = String($('.extra-input input').first().val() || 'eng').trim().toLowerCase();
    if (!language) language = 'eng';

    Promise.all([
        loadScriptPromise('https://cdn.jsdelivr.net/npm/pdf-lib@1.17.1/dist/pdf-lib.min.js'),
        _ctLoadPdfJs(),
        loadScriptPromise('https://cdn.jsdelivr.net/npm/tesseract.js@5.1.1/dist/tesseract.min.js')
    ]).then(async function () {
        try {
            const outBytes = await _ctBuildSearchablePdf(blob, language);
            const base = (fileName || 'document').replace(/\.pdf$/i, '');
            add_file_output(URL.createObjectURL(new Blob([outBytes], { type: 'application/pdf' })), base + '-ocr.pdf');
        } catch (err) {
            console.error('OCR failed:', err);
            alert('Could not OCR this PDF. Try a smaller file, fewer pages, or language code "eng".');
        }
    }).catch(function (err) {
        console.error('Could not load OCR/PDF libraries:', err);
        alert('Could not load OCR/PDF libraries in your browser.');
    });
}

// Cache of in-flight / completed script loads, keyed by URL.
var _loadedScripts = {};

/**
 * Loads a script by appending a <script> element to <head>, once per URL.
 *
 * Repeated calls with the same URL share one promise. A failed load is removed
 * from the cache so a later call can retry.
 *
 * @param {string} url - Script URL.
 * @returns {Promise} Resolves when the script has loaded; rejects with an Error on failure.
 */
function loadScriptPromise(url) {
    if (_loadedScripts[url]) return _loadedScripts[url];
    const promise = new Promise(function (resolve, reject) {
        const s = document.createElement('script');
        s.src = url;
        s.onload = resolve;
        s.onerror = function () {
            if (_loadedScripts[url] === promise) delete _loadedScripts[url];
            reject(new Error('Failed to load script: ' + url));
        };
        document.head.appendChild(s);
    });
    _loadedScripts[url] = promise;
    return promise;
}

/**
 * Replaces every match of `find` in `str` with `replace`.
 *
 * `find` is compiled as a regular expression source, so regex metacharacters
 * in it are NOT treated literally.
 *
 * @param {string} find - Regular expression source to search for.
 * @param {string} replace - Replacement text.
 * @param {string} str - Input string.
 * @returns {string} The string with all matches replaced.
 */
function replaceAll(find, replace, str) {
    return str.replace(new RegExp(find, 'g'), replace);
}

/**
 * Re-indents brace-delimited text (e.g. JSON): a carriage return plus
 * indentation is inserted after `{` and `,` and before `}`, and existing
 * whitespace around those characters is dropped. Text inside double quotes
 * is copied through untouched.
 *
 * @param {string} str - Text to format.
 * @returns {string} The formatted text, using '\r' as the line separator.
 */
function beautify(str) {
    const WHITESPACE = ' \r\n\t';
    let result = '';
    let depth = 0;
    let withinQuotes = false;
    // Number of consecutive backslashes directly before the current character;
    // a quote is escaped only when this count is odd.
    let backslashRun = 0;
    let i = 0;

    while (i < str.length) {
        const c = str[i];
        if (c === '"' && backslashRun % 2 === 0) {
            withinQuotes = !withinQuotes;
        }
        backslashRun = (c === '\\') ? backslashRun + 1 : 0;

        if (!withinQuotes && (c === '{' || c === '}' || c === ',')) {
            // Drop whitespace already emitted before this structural character.
            let end = result.length;
            while (end > 0 && WHITESPACE.includes(result[end - 1])) end--;
            result = result.slice(0, end);

            if (c === '}') {
                depth--;
                result += '\r' + GetTabs(depth) + c;
            } else {
                if (c === '{') depth++;
                result += c + '\r' + GetTabs(depth);
            }

            // Skip whitespace that follows in the input.
            while (i + 1 < str.length && WHITESPACE.includes(str[i + 1])) i++;
        } else {
            result += c;
        }
        i++;
    }
    return result;
}

/**
 * Builds the indentation string for a nesting level. Despite the name, each
 * level contributes a single space character.
 *
 * @param {number} count - Nesting level; values <= 0 yield an empty string.
 * @returns {string} The indentation string.
 */
function GetTabs(count) {
    return count > 0 ? ' '.repeat(Math.ceil(count)) : '';
}